import catboost
import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats
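# load the Numerai training and validation data and collect the feature columns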
trainset = pd.read_parquet("data/numerai_training_data.parquet")
validset = pd.read_parquet("data/numerai_validation_data.parquet")
feature_names = [f for f in trainset.columns if "feature_" in f]
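# fit a baseline CatBoost regressor on the training data and score the validation set;
# task_type="GPU" assumes a GPU is available, otherwise switch to "CPU"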
params = {
    "iterations": 1000,
    "learning_rate": 0.01,
    "depth": 6,
    "task_type": "GPU",
    "verbose": False,
}
model = catboost.CatBoostRegressor(**params)
model.fit(trainset[feature_names], trainset["target"])
validset["base_preds"] = model.predict(validset[feature_names])
# this is Numerai's code for feature neutralization:
# https://forum.numer.ai/t/model-diagnostics-feature-exposure/899
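# get_biggest_change_features ranks features by how much their mean per-era correlation
# (here, with the base predictions) shifts between the first and second half of eras;
# neutralize subtracts a proportion of the predictions' linear exposure to those features, era by era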
def get_biggest_change_features(corrs, n):
    all_eras = corrs.index.sort_values()
    h1_eras = all_eras[:len(all_eras) // 2]
    h2_eras = all_eras[len(all_eras) // 2:]
    h1_corr_means = corrs.loc[h1_eras, :].mean()
    h2_corr_means = corrs.loc[h2_eras, :].mean()
    corr_diffs = h2_corr_means - h1_corr_means
    worst_n = corr_diffs.abs().sort_values(ascending=False).head(n).index.tolist()
    return worst_n
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    if neutralizers is None:
        neutralizers = []
    unique_eras = df[era_col].unique()
    computed = []
    for u in unique_eras:
        df_era = df[df[era_col] == u]
        scores = df_era[columns].values
        if normalize:
            # rank-transform each column within the era, then map the ranks to a standard normal
            scores2 = []
            for x in scores.T:
                x = (stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        # subtract (a proportion of) the least-squares projection of the scores
        # onto the neutralizer features
        exposures = df_era[neutralizers].values
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))
        scores /= scores.std(ddof=0)
        computed.append(scores)
    return pd.DataFrame(np.concatenate(computed),
                        columns=columns,
                        index=df.index)
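# grid of neutralization proportions and riskiest-feature counts to sweep over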
neutralization_proportions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
n_riskiests = [20, 50, 150, 300, 500, 700, 900, len(feature_names)]
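# per-era correlation of every feature with the base predictions;
# get_biggest_change_features uses these to pick the riskiest features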
all_feature_corrs = validset.groupby("era").apply(
    lambda era: era[feature_names].corrwith(era["base_preds"])
)
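# for each (proportion, n_riskiest) pair: neutralize the base predictions against the riskiest
# features, then record the mean per-era correlation with the target and its sharpe ratio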
y_corr_outputs = []
y_sharpe_outputs = []
for i, neutralization_proportion in enumerate(neutralization_proportions):
    x_corr_outputs = []
    x_sharpe_outputs = []
    for j, n_riskiest in enumerate(n_riskiests):
        riskiest_features = get_biggest_change_features(all_feature_corrs, n_riskiest)
        # neutralize returns a one-column DataFrame, so select that column explicitly
        validset["modified_preds"] = neutralize(
            validset,
            ["base_preds"],
            neutralizers=riskiest_features,
            proportion=neutralization_proportion,
            normalize=True,
        )["base_preds"]
        era_wise_correlations = validset.groupby("era").apply(
            lambda era: np.corrcoef(era["modified_preds"], era["target"])[0, 1]
        )
        mean_corr = era_wise_correlations.mean()
        # sharpe ratio of the per-era correlations: mean divided by standard deviation
        sharpe_ratio = mean_corr / era_wise_correlations.std()
        x_corr_outputs.append(mean_corr)
        x_sharpe_outputs.append(sharpe_ratio)
    y_corr_outputs.append(x_corr_outputs)
    y_sharpe_outputs.append(x_sharpe_outputs)
corr_outputs = np.array(y_corr_outputs)
sharpe_outputs = np.array(y_sharpe_outputs)
# heatmap of mean correlation: square cells, with the value printed inside each cell
fig = px.imshow(
    corr_outputs,
    x=n_riskiests,
    y=neutralization_proportions,
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Correlation with Targets",
    labels={"x": "N Risky Features", "y": "Neutralization Proportion"},
    width=800,
    height=800,
    aspect="equal",   # square cells
    text_auto=".3f",  # show the value in each cell (requires plotly >= 5.5)
)
fig.show(renderer="notebook")
fig = px.imshow(
    sharpe_outputs,
    x=n_riskiests,
    y=neutralization_proportions,
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Sharpe Ratios",
    labels={"x": "N Risky Features", "y": "Neutralization Proportion"},
    width=800,
    height=800,
    aspect="equal",   # square cells
    text_auto=".3f",  # show the value in each cell (requires plotly >= 5.5)
)
fig.show(renderer="notebook")